%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import json
from operator import itemgetter
from collections import Counter
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from statsmodels.graphics.mosaicplot import mosaic
import seaborn as sns
from colour import Color
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from lime.lime_tabular import LimeTabularExplainer
from sklearn.metrics import classification_report, confusion_matrix
def iter_dataset(path=None):
    """Yield (prescription_counts, provider_variables) pairs from a JSONL file.

    Each line of the file is one JSON object; the two values yielded are its
    'cms_prescription_counts' and 'provider_variables' entries (both dicts,
    per the loading code below).

    Parameters
    ----------
    path : str, optional
        Path of the JSONL file to read. Defaults to the module-level
        ``filename`` so existing zero-argument calls keep working.

    Yields
    ------
    tuple(dict, dict)
        (prescription counts, provider variables) for one practitioner.
    """
    if path is None:
        # Preserve the original behavior of reading the notebook-level global.
        path = filename
    with open(path, 'rt') as fid:
        for line in fid:
            example = json.loads(line)
            yield (example['cms_prescription_counts'], example['provider_variables'])
The data is in JSON format, saved as a list of tuples. Each tuple consists of two dictionaries: the first dictionary provides information about the prescriptions and their counts, and the second dictionary contains information about the prescription provider.
I am analyzing only those practitioners who have prescribed more than 50 drugs.
# Load the JSONL corpus, keeping only practitioners whose prescription-count
# dictionary contains more than 50 distinct drugs.
filename = 'roam_prescription_based_prediction.jsonl'
data = [
    (phi_dict, y_dict)
    for phi_dict, y_dict in iter_dataset()
    if len(phi_dict) > 50
]
print('Example data :\n')
data[0]
In this section we will perform logistic regression. We will use the drugs that are prescribed by the practitioners to predict targets. In our case we will use 'region', 'gender' and 'specialty' as the target, and see how well logistic regression does at predicting them. I am using grid search to choose the best parameters for classification.
def cross_validate(X, y, mod, param_grid=None, n_fold=5):
    """Run ``n_fold`` rounds of grid-searched train/test evaluation.

    For each round, a fresh random 90/10 train/test split is drawn,
    ``GridSearchCV`` picks the best hyper-parameters on the training part,
    and the best estimator is scored on the held-out part.

    Parameters
    ----------
    X : sparse matrix or array
        Feature matrix.
    y : array-like
        Target labels.
    mod : estimator
        The sklearn estimator to tune and evaluate.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. Defaults to an empty grid.
        (Was a mutable default argument ``{}``; now None-guarded.)
    n_fold : int
        Number of outer rounds, and also the inner CV fold count.

    Returns
    -------
    tuple
        (scores, models, c_report, con_mat, X_test, y_test).
        NOTE: the classification report, confusion matrix and test split
        come from the LAST round only; scores/models cover every round.
    """
    if param_grid is None:
        param_grid = {}
    scores = np.zeros(n_fold)
    models = []
    for i in range(n_fold):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        cross_val = GridSearchCV(mod, param_grid=param_grid, cv=n_fold, n_jobs=-1)
        cross_val.fit(X_train, y_train)
        # GridSearchCV's default refit=True already refits the best estimator
        # on all of X_train, so no extra fit() call is needed here.
        best_mod = cross_val.best_estimator_
        predictions = best_mod.predict(X_test)
        c_report = classification_report(y_true=y_test, y_pred=predictions)
        con_mat = confusion_matrix(y_true=y_test, y_pred=predictions)
        score = best_mod.score(X_test, y_test)
        scores[i] = score
        models.append(best_mod)
        # Report only the parameters that were actually tuned.
        best_mod_summary = {k: best_mod.get_params()[k] for k in param_grid}
        print("Fold {0:} score {1:0.03f} best parameters:{2:}".format(
            (i + 1), score, best_mod_summary))
    return (scores, models, c_report, con_mat, X_test, y_test)
In total there are 113 specialties. However, if you look carefully, many specialties are very similar or may even be a sub-category of another specialty. By setting a threshold of 500 practitioners, we limit the analysis to the top 23 specialties only.
# Selecting a subset of data where there are at least 500 practitioners
# for a specialty.
feats, ys = zip(*data)
ys = pd.DataFrame(list(ys))  # preparing targets
specialty_count = ys['specialty'].value_counts()
print("Total number of specialties in dataset = ", len(specialty_count))
# Keep only specialties with at least 500 practitioners.
specialty_subset = [x for x, c in specialty_count.items() if c >= 500]
# FIX: the message previously said "100 or more" while the code threshold
# is 500; the message now matches the actual cutoff.
print("Number of specialties with 500 or more practitioners = ", len(specialty_subset))
data_subset = [(x_dict, y_dict) for x_dict, y_dict in data
               if y_dict['specialty'] in specialty_subset]
feats, ys = zip(*data_subset)
ys = pd.DataFrame(list(ys))  # preparing targets
vectorizer = DictVectorizer(sparse=True)
X = vectorizer.fit_transform(feats)
X = TfidfTransformer().fit_transform(X)  # preparing features
Performing logistic regression using region as the target.
from sklearn.linear_model import LogisticRegression

# Cross-validated logistic regression with 'region' as the target,
# grid-searching the regularization strength C.
(region_score, region_model, region_class_report,
 region_conf_mat, X_test, y_test) = cross_validate(
    X,
    ys['region'],
    LogisticRegression(solver='lbfgs', multi_class='multinomial'),
    param_grid={'C': np.logspace(-1, 3, 4)},
)
def run_all_experiments(X, ys, mod, targets=('gender', 'region', 'specialty'),
                        param_grid=None, n_fold=5):
    """Cross-validate ``mod`` against each target column and collect results.

    Parameters
    ----------
    X : sparse matrix or array
        Feature matrix shared by all experiments.
    ys : pandas.DataFrame
        Target table; one column per entry of ``targets``.
    mod : estimator
        The sklearn estimator to tune and evaluate.
    targets : tuple of str
        Column names of ``ys`` to use as targets, one experiment each.
    param_grid : dict, optional
        Grid forwarded to ``cross_validate``. Defaults to an empty grid.
        (Was a mutable default argument ``{}``; now None-guarded.)
    n_fold : int
        Number of folds forwarded to ``cross_validate``.

    Returns
    -------
    list of dict
        One summary dict per target: per-fold scores, mean score, fitted
        models, last-fold classification report / confusion matrix, and
        the last-fold test split.
    """
    if param_grid is None:
        param_grid = {}
    summary = []
    for target in targets:
        y = ys[target]
        this_summary = {'model': mod.__class__.__name__,
                        'target': target, 'n_classes': len(set(y))}
        print("CURRENT TARGET : {}".format(target))
        scores_t, models_t, class_report_t, conf_mat_t, X_test, y_test = \
            cross_validate(X, y, mod, param_grid=param_grid, n_fold=n_fold)
        for i, s in enumerate(scores_t, start=1):
            this_summary['score{}'.format(i)] = s
        this_summary['score_mean'] = scores_t.mean()
        this_summary['models'] = models_t
        this_summary['class_report'] = class_report_t
        this_summary['conf_matrix'] = conf_mat_t
        this_summary['X_test'] = X_test
        this_summary['y_test'] = y_test
        summary.append(this_summary)
    return summary
# Run logistic regression for every target (gender, region, specialty).
all_exp_log = run_all_experiments(
    X, ys,
    LogisticRegression(solver='lbfgs', multi_class='multinomial'),
    param_grid={'C': np.logspace(-1, 3, 4)},
)

# Saving the data for future use.
import pickle
with open('all_exp_log.pkl', 'wb') as fid:
    pickle.dump(all_exp_log, fid)
    # all_exp_log = pickle.load(fid)
The table below shows the results of Logistic regression for all the three targets(region, gender and specialty). It is interesting to observe that based on the drugs prescribed it seems like it is possible to determine the gender of the practitioner. This is only because, there is a strong correlation between gender and the specialty, and not because male and female practitioners have any preference for drugs prescribed.
Below, we will look into the specialties in more detail.
import pickle

# Reload the persisted experiment results.
with open('all_exp_log.pkl', 'rb') as fid:
    all_exp_log = pickle.load(fid)

# One row per target experiment.
all_target_table = pd.DataFrame(all_exp_log)
all_target_table

# This is only for the specialty.
target_summary = next(d for d in all_exp_log if d['target'] == 'specialty')
print(target_summary['class_report'])
Based on the classification report, it seems that the specialties of many practitioners could be accurately predicted (overall f1-score of 0.8, with many classes > 0.7 and up to almost 0.98). However, there are some specialties (Adult Health and Medical) that have a very poor prediction score. Let's look at the confusion matrix to get an idea why this would be the case.
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Oranges3 as palette
from bokeh.io import output_notebook

# Class labels in the order used by the fitted model / confusion matrix rows.
spec_names = target_summary['models'][0].classes_.tolist()
alpha_max = np.max(target_summary['conf_matrix'])
# Row totals = number of true examples per specialty.
total_spec = target_summary['conf_matrix'].sum(axis=1)

# Flatten the matrix into parallel x / y / alpha lists for bokeh's rect glyph.
xname = []
yname = []
alpha = []
for i, true_name in enumerate(spec_names):
    for j, pred_name in enumerate(spec_names):
        xname.append(true_name)
        yname.append(pred_name)
        # Cell opacity = row-normalized confusion count.
        alpha.append(target_summary['conf_matrix'][i][j] / total_spec[i])

target_summary['conf_matrix'].shape

source = ColumnDataSource(data=dict(
    xname=xname,
    yname=yname,
    alphas=alpha,
    count=target_summary['conf_matrix'].flatten(),
))

output_notebook()

p = figure(title="Confusion matrix",
           x_axis_location="above", tools="hover,save",
           x_range=list(reversed(spec_names)), y_range=spec_names)
p.plot_width = 700
p.plot_height = 700
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "9pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/3
p.title.text_font_size = '18pt'
p.rect('xname', 'yname', 1.0, 1.0, source=source, alpha='alphas',
       hover_line_color='black', fill_color='#c51b8a')
p.select_one(HoverTool).tooltips = [
    ('spec_names', '@yname, @xname'),
    ('count', '@count'),
]
# Alternative fill colors tried: 637939, e6550d, c51b8a
show(p)
The confusion matrix now provides more insight into why the f1-scores for 'Adult Health' and 'Medical' are so low. It seems that many practitioners having a specialty in these classes are being wrongly predicted as 'Family'!!!
Next, we will use LIME to understand the reason behind this mis-classification. More specifically, we will look at random individual examples for these categories. Using the weights obtained during the logistic regression in conjunction with the drugs prescribed by these individual practitioners will provide an idea behind the poor f1-scores.
In the section below I will use LIME to understand the classification made by logistic regression.
Let's first look at a specialty that is predicted very robustly by Logistic regression.
# target_summary already exists from the cells above.
X_toarray = target_summary['X_test'].toarray()
# Pick any one of the models that is in the target summary.
model_t = target_summary['models'][0]
explainer = LimeTabularExplainer(X_toarray,
                                 feature_names=vectorizer.get_feature_names(),
                                 class_names=model_t.classes_,
                                 discretize_continuous=True)
# Pick 5 distinct test samples whose true specialty is 'Psychiatry'.
# FIX: was replace=None, which only behaved as sampling without replacement
# because None is falsy; replace=False states the intent explicitly.
samples = np.random.choice(
    [i for i, cls in enumerate(target_summary['y_test'].values)
     if cls == 'Psychiatry'],
    size=5, replace=False)
# samples = [4085,22447,11688]
explain_all = []
for sample in samples:
    print("Sample number = ", sample)
    print("True Specialty = ", target_summary['y_test'].iloc[sample])
    # Run LIME on this single example.
    exp = explainer.explain_instance(X_toarray[sample], model_t.predict_proba,
                                     num_features=5, top_labels=1)
    explain_all.append({'sample': exp})
    exp.show_in_notebook(show_table=True, show_all=False)
    # exp.save_to_file('sample_'+str(sample)+'.html')
    exp.save_to_file(target_summary['y_test'].iloc[sample] + str(sample) + '.html')
LIME calculates the probability of the sample belonging to a particular class by perturbing the values of the features and then estimating how this affects the probabilities of belonging to a class.
For the specialty 'Psychiatry', which is predicted very robustly, I test 5 random samples. The right-most table shows the features that were used to make the classification and their values. One thing to notice is that most of the prescription drugs (features) that were used by LIME to make the prediction are very similar for this specialty (e.g. 'CLONAZEPAM', 'QUETIAPINE FUMARATE' etc).
# Pick 5 distinct test samples whose true specialty is 'Adult Health'.
# FIX: was replace=None, which only behaved as sampling without replacement
# because None is falsy; replace=False states the intent explicitly.
samples = np.random.choice(
    [i for i, cls in enumerate(target_summary['y_test'].values)
     if cls == 'Adult Health'],
    size=5, replace=False)
explain_all = []
for sample in samples:
    print("Sample number = ", sample)
    print("True Specialty = ", target_summary['y_test'].iloc[sample])
    # Run LIME on this single example.
    exp = explainer.explain_instance(X_toarray[sample], model_t.predict_proba,
                                     num_features=5, top_labels=1)
    explain_all.append({'sample': exp})
    exp.show_in_notebook(show_table=True, show_all=False)
    # exp.save_to_file('sample_'+str(sample)+'.html')
    exp.save_to_file(target_summary['y_test'].iloc[sample] + str(sample) + '.html')
Above, I have done the LIME analysis for 5 samples randomly chosen from the specialty 'Adult Health'. The right-most table shows that the drugs (features) prescribed by practitioners belonging to this class vary a lot. Also, the prediction probabilities for these practitioners to belong to 'Adult Health' are not very high, and in many cases it is either the second or third most probable specialty.